import pandas as pd
import numpy as np
import warnings
import regex as re
warnings.filterwarnings('ignore')#to filter all the warnings
import seaborn as sns
pd.set_option('float_format', '{:.4f}'.format)# to keep the float values short
# Import for wordcloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# import for plotly
import plotly.express as px
# Daily snapshots of YouTube trending videos for the USA (Kaggle dataset).
US_Videos_df = pd.read_csv('US_youtube_trending_data.csv')
US_Videos_df.head(1)
| video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3C66w5Z0ixs | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | UCvtRTOMP2TqYqu51xNrqAzg | Brawadis | 22 | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | 5855 | 35313 | https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg | False | False | SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib... |
# Daily snapshots of YouTube trending videos for Canada.
CA_Videos_df = pd.read_csv('CA_youtube_trending_data.csv')
CA_Videos_df.head(1)
| video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KX06ksuS6Xo | Diljit Dosanjh: CLASH (Official) Music Video |... | 2020-08-11T07:30:02Z | UCZRdNleCgW-BGUJf-bbjzQg | Diljit Dosanjh | 10 | 2020-08-12T00:00:00Z | clash diljit dosanjh|diljit dosanjh|diljit dos... | 9140911 | 296541 | 6180 | 30059 | https://i.ytimg.com/vi/KX06ksuS6Xo/default.jpg | False | False | CLASH official music video performed by DILJIT... |
# Daily snapshots of YouTube trending videos for Great Britain.
GB_Videos_df = pd.read_csv('GB_youtube_trending_data.csv')
GB_Videos_df.head(1)
| video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | J78aPJ3VyNs | I left youtube for a month and THIS is what ha... | 2020-08-11T16:34:06Z | UCYzPXprvl5Y-Sf0g4vX-m6g | jacksepticeye | 24 | 2020-08-12T00:00:00Z | jacksepticeye|funny|funny meme|memes|jacksepti... | 2038853 | 353790 | 2628 | 40228 | https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg | False | False | I left youtube for a month and this is what ha... |
# Sanity-check the three raw frames: same column count, similar row counts.
print(f'Shape of GB File: {GB_Videos_df.shape}')
print(f'Shape of CA File: {CA_Videos_df.shape}')
print(f'Shape of US File: {US_Videos_df.shape}')
Shape of GB File: (92395, 16) Shape of CA File: (92344, 16) Shape of US File: (92391, 16)
The dataset we downloaded from Kaggle has two types of files for each country, one is 'video.csv' which contains all the features described before, and the other is 'category.json' which contains mapping for category id to category names.
To merge this information together, we did the following steps:
1. Load JSON File for each country
import json  # stdlib module for parsing the category-mapping files

# Load the category-id -> category-name mapping JSON for each country.
# json.load(f) parses straight from the open file handle — no need for
# the json.loads(f.read()) round-trip.
with open('US_category_id.json', 'r') as f:
    category_data_us = json.load(f)
with open('CA_category_id.json', 'r') as f:
    category_data_ca = json.load(f)
with open('GB_category_id.json', 'r') as f:
    category_data_gb = json.load(f)
2. Since JSON file was in nested format we used json normalize function from pandas to flatten it and read into data frame
# Flatten the nested JSON: each entry under 'items' becomes one row
# (columns such as 'id' and 'snippet.title').
US_cat = pd.json_normalize(category_data_us,record_path='items')
CA_cat = pd.json_normalize(category_data_ca,record_path='items')
GB_cat = pd.json_normalize(category_data_gb,record_path='items')
# Converting the 'id' extracted from the json file to type 'int'
# (JSON stores the ids as strings, but the videos' categoryId is int64;
# the dtypes must match for the merges below).
US_cat['id']= US_cat['id'].astype(int)
CA_cat['id']= CA_cat['id'].astype(int)
GB_cat['id']= GB_cat['id'].astype(int)
3. Merging videos dataframe and category dataframe for all countries using left join
# Left-join each country's videos onto its category table (categoryId -> id);
# a left join keeps videos whose categoryId has no mapping (category_name
# becomes NaN for those — handled later). 'snippet.title' is the category
# name in the flattened JSON.
US_Videos_df= US_Videos_df.merge(US_cat,how ='left',left_on= 'categoryId',\
right_on='id').rename(columns= {'snippet.title':'category_name'})
CA_Videos_df= CA_Videos_df.merge(CA_cat,how ='left',left_on= 'categoryId',\
right_on='id').rename(columns= {'snippet.title':'category_name'})
GB_Videos_df= GB_Videos_df.merge(GB_cat,how ='left',left_on= 'categoryId',\
right_on='id').rename(columns= {'snippet.title':'category_name'})
# Row counts must be unchanged by the left joins; only columns grow.
print(f'Shape of GB File: {GB_Videos_df.shape}')
print(f'Shape of CA File: {CA_Videos_df.shape}')
print(f'Shape of US File: {US_Videos_df.shape}')
Shape of GB File: (92395, 22) Shape of CA File: (92344, 22) Shape of US File: (92391, 22)
# Tag each frame with its country so the origin survives concatenation.
US_Videos_df['country']= 'USA'
CA_Videos_df['country']= 'Canada'
GB_Videos_df['country']= 'Great Britain'
US_Videos_df.head(1)
| video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | ... | comments_disabled | ratings_disabled | description | kind | etag | id | category_name | snippet.assignable | snippet.channelId | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3C66w5Z0ixs | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | UCvtRTOMP2TqYqu51xNrqAzg | Brawadis | 22 | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | ... | False | False | SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib... | youtube#videoCategory | QMEBz6mxVdklVaq8JwesPEw_4nI | 22 | People & Blogs | True | UCBR8-60-B28hp2BmDPdntcQ | USA |
1 rows × 23 columns
# Stack the three country frames into one; drop the old per-frame index.
df_list= [US_Videos_df,CA_Videos_df,GB_Videos_df]
df= pd.concat(df_list).reset_index(drop=True)
# Per-country row counts should match the individual frame shapes above.
df.groupby('country')['video_id'].count()
country Canada 92344 Great Britain 92395 USA 92391 Name: video_id, dtype: int64
df.shape
(277130, 23)
df.drop(columns=['thumbnail_link','kind','etag','id','snippet.assignable','snippet.channelId','channelId'], axis='columns').shape
(277130, 16)
# Drop identifier/metadata columns that add no analytical value
# (thumbnail URL, raw-JSON bookkeeping fields, duplicate id columns).
# When `columns=` is given, an extra `axis` argument is redundant.
df.drop(columns=['thumbnail_link', 'kind', 'etag', 'id', 'snippet.assignable',
                 'snippet.channelId', 'channelId'], inplace=True)
df.head(1)
| video_id | title | publishedAt | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | comments_disabled | ratings_disabled | description | category_name | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3C66w5Z0ixs | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | Brawadis | 22 | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | 5855 | 35313 | False | False | SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib... | People & Blogs | USA |
df[df.comments_disabled==True]['comment_count'].head()
53 0 131 0 134 0 242 0 276 0 Name: comment_count, dtype: int64
df[df.ratings_disabled==True][['likes','dislikes']].sample(5)
| likes | dislikes | |
|---|---|---|
| 50005 | 0 | 0 |
| 30607 | 0 | 0 |
| 187128 | 0 | 0 |
| 253366 | 0 | 0 |
| 144763 | 0 | 0 |
df[df.comments_disabled==True].comment_count.sum()
0
df[df.ratings_disabled==True][['likes','dislikes']].sum()
likes 0 dislikes 0 dtype: int64
# Both flags were shown above to coincide with zero counts, so the
# columns carry no extra information once the counts are kept.
# (`axis` is redundant when `columns=` is specified.)
df.drop(columns=['comments_disabled', 'ratings_disabled'], inplace=True)
df.shape
(277130, 14)
df.isna().sum()
video_id 0 title 0 publishedAt 0 channelTitle 0 categoryId 0 trending_date 0 tags 0 view_count 0 likes 0 dislikes 0 comment_count 0 description 4541 category_name 164 country 0 dtype: int64
df[df.category_name.isna()].head()
| video_id | title | publishedAt | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | description | category_name | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 96874 | zvxrdwbmnHU | Nisswa Mayor Arrested for Involvement in Traff... | 2020-09-01T04:50:02Z | Lakeland PBS | 29 | 2020-09-03T00:00:00Z | Lakeland PBS|LPTV|PBS|Bemidji|Minnesota|MN | 345993 | 5829 | 172 | 3209 | Originally aired August 31, 2020For more infor... | NaN | Canada |
| 97107 | zvxrdwbmnHU | Nisswa Mayor Arrested for Involvement in Traff... | 2020-09-01T04:50:02Z | Lakeland PBS | 29 | 2020-09-04T00:00:00Z | Lakeland PBS|LPTV|PBS|Bemidji|Minnesota|MN | 408941 | 6578 | 187 | 3533 | Originally aired August 31, 2020For more infor... | NaN | Canada |
| 97325 | zvxrdwbmnHU | Nisswa Mayor Arrested for Involvement in Traff... | 2020-09-01T04:50:02Z | Lakeland PBS | 29 | 2020-09-05T00:00:00Z | Lakeland PBS|LPTV|PBS|Bemidji|Minnesota|MN | 447580 | 7233 | 198 | 3828 | Originally aired August 31, 2020For more infor... | NaN | Canada |
| 97555 | zvxrdwbmnHU | Nisswa Mayor Arrested for Involvement in Traff... | 2020-09-01T04:50:02Z | Lakeland PBS | 29 | 2020-09-06T00:00:00Z | Lakeland PBS|LPTV|PBS|Bemidji|Minnesota|MN | 469103 | 7607 | 208 | 4000 | Originally aired August 31, 2020For more infor... | NaN | Canada |
| 97671 | P5urIeEcuvA | Miley Cyrus quits veganism | 2020-09-05T18:17:10Z | Earthling Ed | 29 | 2020-09-07T00:00:00Z | earthlinged|earthling ed|vegan|veganism|why|be... | 132065 | 14258 | 380 | 2822 | ✺ find out more about what i do & sign up for ... | NaN | Canada |
df[df.categoryId==29].category_name
4474 Nonprofits & Activism
4690 Nonprofits & Activism
4897 Nonprofits & Activism
5132 Nonprofits & Activism
5383 Nonprofits & Activism
...
269102 NaN
274761 NaN
275009 NaN
275263 NaN
275501 NaN
Name: category_name, Length: 252, dtype: object
df[df.category_name.isna()]['title'].unique()
array(['Nisswa Mayor Arrested for Involvement in Traffic Stop',
'Miley Cyrus quits veganism',
'Fast Times At Ridgemont High | Virtual Table Read for CORE',
'[Full livestream] Watch the Countdown Global Launch, a call to action on climate change',
'RETURNING To YouTube? Our CHARITY FUNDRAISER! (Ft. Orla Gartland)',
'It Counts', 'Happy Birthday, #TeamTrees!',
'The cast of Glee pays tribute to Naya Rivera & Santana Lopez at the 32nd Annual GLAAD Media Awards',
'Color the Spectrum LIVE- Mark Rober and Jimmy Kimmel',
'Global Citizen VAX Live - Extended Concert Only on YouTube',
"I Was Hacked. But Now I'm BACK!",
'President Moon Jae-in & BTS at the Sustainable Development Goals Moment | United Nations (English)',
'BTS Shine Spotlight on the United Nations as Envoys of the President of the Republic of Korea',
'BTS (방탄소년단) at Global Citizen Live Concert - Permission to Dance | #GlobalCitizenLive',
'Coldplay and BTS Share New Song My Universe | Global Citizen Live',
'Coldplay – Fix You (Live with Billie Eilish and FINNEAS in New York City) | Global Citizen Live',
'BTS Performs Permission to Dance in Seoul to open Global Citizen Live | Global Citizen Live',
'Pandora Papers: An unprecedented leak exposes the inner workings of a shadow economy',
"Don't Choose Extinction - UNDP | United Nations | Jack Black | Climate Action",
'The mannii #tiktok #shorts',
'The Joe Wicks 24 Hour PE Challenge | Part 5 | Schools Finale',
'The Joe Wicks 24 Hour PE Challenge | Part 1',
'Save Ralph - A short film with Taika Waititi'], dtype=object)
df.category_name.fillna('Nonprofits & Activism').isna().any()
False
# All rows with a missing category_name were shown above to be
# categoryId-29 videos, i.e. 'Nonprofits & Activism'. Assign the filled
# column back instead of calling fillna(inplace=True) on a column view,
# which is deprecated under pandas copy-on-write and may not modify df.
df['category_name'] = df['category_name'].fillna('Nonprofits & Activism')
df.description.fillna('')
0 SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1 While running her own modding shop, Ramya Pare...
2 I left youtube for a month and this is what ha...
3 Subscribe to XXL → http://bit.ly/subscribe-xxl...
4 Transforming The LaBrant Family's empty white ...
...
277125 Mariah Carey, Khalid, & Kirk Franklin – Fall I...
277126 Amelia meets Ed Sheeran at Sam’s Chicken for a...
277127 Official visualiser for Westlife's 'My Hero', ...
277128 Seen something you like? Why not hunt for it i...
277129 Salma Hayek is an Academy Award-nominated actr...
Name: description, Length: 277130, dtype: object
df.description.fillna('').isna().any()
False
# Replace missing descriptions with the empty string. Column-view
# fillna(inplace=True) is deprecated (copy-on-write); assign back instead.
df['description'] = df['description'].fillna('')
df.tags.sample(4)
69708 noahj456|noah|noahj|gaming 137719 [None] 235461 tommyinnit|vlog|in real life|minecraft|ksi|in|... 203409 farah dhukai|sal ali|pregnancy|pregnancy annou... Name: tags, dtype: object
df.title.sample(4)
69544 Undercover Boss is Ridiculous 200542 NEW CELEBRATION INCOMING!! I WAX LYRICAL FT RH... 136339 Demi Lovato: Dancing With the Devil | Live Pre... 37366 Kele, Giovakartoons & Etervidos - Por Lo Mio (... Name: title, dtype: object
df.channelTitle.sample(4)
194847 Legend Of Winning 41811 Jack Harlow 228696 NPR Music 108201 The Herbert's Name: channelTitle, dtype: object
df.description.sample(4)
168633 La musique est disponible dès maintenant sur t... 69203 ► SUBSCRIBE to GameToons Gaming! -https://www.... 30547 Please subscribe to 【李子柒 Liziqi 】Liziqi Chann... 133396 Enjoy the best of the action as goals from Bru... Name: description, dtype: object
def isEnglish(s):
    """Return True if *s* contains only ASCII characters.

    NOTE: despite the name, this is an ASCII check, not a language
    check — accented or non-Latin text is rejected, while plain-ASCII
    text in any language passes.
    """
    # str.isascii() (Python 3.7+) is equivalent to attempting
    # s.encode('ascii') and catching UnicodeEncodeError, but simpler.
    return s.isascii()
isEnglish('slabiky, ale liší se podle významu')
False
isEnglish('This sentence is in English')
True
def removeNonEnglishWords(text):
    """Rebuild *text*, keeping only the whitespace-separated tokens that
    pass the isEnglish (pure-ASCII) check; tokens are re-joined with
    single spaces."""
    return " ".join(token for token in text.split() if isEnglish(token))
text = "‣ what was tekoi: tekoi commentary: old version of tekoi: crowdfundersbob kunz, john buchan, nevin spoljaric, donal botkin, bn-12, chris chapin, richard jenkins, phil gardner, martin, steven grimm, سليمان العقل, david f watson, colin millions, saki comandao, ben schwab, jason lewandowski, marco arment, shantanu raj, rictic, emptymachine, george lin, henry ng, thunda plum, awoo, david tyler, fuesu, iulus, jordan earls, joshua jamison, nick fish, nick gibson, tyler bryant, zach whittle, oliver steele, kermit norlund, kevin costello, derek bonner, derek jackson, mikko , orbit_junkie, ron bowes, tómas árni jónasson, bryan mclemore, alex simonides, felix weis, melvin sowah, christopher mutchler, giulio bontadini, paul alom, ryan tripicchio, scot melville, bear, chrysilis, david palomares, emil, erik parasiuk, esteban santana santana, freddi hørlyck, john rogers, leon, peter lomax, rhys parry, shiroiyami, tristan watts-willis, veronica peshterianu, dag viggo lokøen, john lee, maxime zielony, julien dubois, elizabeth keathley, nicholas welna## musicdavid rees:"
print(removeNonEnglishWords(text))
what was tekoi: tekoi commentary: old version of tekoi: crowdfundersbob kunz, john buchan, nevin spoljaric, donal botkin, bn-12, chris chapin, richard jenkins, phil gardner, martin, steven grimm, david f watson, colin millions, saki comandao, ben schwab, jason lewandowski, marco arment, shantanu raj, rictic, emptymachine, george lin, henry ng, thunda plum, awoo, david tyler, fuesu, iulus, jordan earls, joshua jamison, nick fish, nick gibson, tyler bryant, zach whittle, oliver steele, kermit norlund, kevin costello, derek bonner, derek jackson, mikko , orbit_junkie, ron bowes, bryan mclemore, alex simonides, felix weis, melvin sowah, christopher mutchler, giulio bontadini, paul alom, ryan tripicchio, scot melville, bear, chrysilis, david palomares, emil, erik parasiuk, esteban santana santana, freddi john rogers, leon, peter lomax, rhys parry, shiroiyami, tristan watts-willis, veronica peshterianu, dag viggo john lee, maxime zielony, julien dubois, elizabeth keathley, nicholas welna## musicdavid rees:
df['isEnglish'] = df.description.apply(lambda s: isEnglish(s))
df.isEnglish.value_counts()
False 179161 True 97969 Name: isEnglish, dtype: int64
df[df.isEnglish==False].description
0 SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1 While running her own modding shop, Ramya Pare...
2 I left youtube for a month and this is what ha...
3 Subscribe to XXL → http://bit.ly/subscribe-xxl...
5 Subscribe To My Channel - https://www.youtube....
...
277125 Mariah Carey, Khalid, & Kirk Franklin – Fall I...
277126 Amelia meets Ed Sheeran at Sam’s Chicken for a...
277127 Official visualiser for Westlife's 'My Hero', ...
277128 Seen something you like? Why not hunt for it i...
277129 Salma Hayek is an Academy Award-nominated actr...
Name: description, Length: 179161, dtype: object
df['des']= df.description.apply(removeNonEnglishWords)
df[['description','des']].sample(5)
| description | des | |
|---|---|---|
| 213701 | Gary Anderson preview Gezzy Price World Champi... | Gary Anderson preview Gezzy Price World Champi... |
| 167040 | Get your tickets to Roadkill Nights here►https... | Get your tickets to Roadkill Nights out my Dod... |
| 39639 | THE HERMES MCLAREN SPEEDTAIL IS FINALLY HERE!►... | THE HERMES MCLAREN SPEEDTAIL IS FINALLY Thanks... |
| 80936 | Videos like this are only possible with your h... | Videos like this are only possible with your h... |
| 276580 | I got my brother his dream apartment in London... | I got my brother his dream apartment in Subscr... |
df.drop(columns=['description','isEnglish'],axis=1,inplace=True)
df['isEnglish'] = df.tags.apply(lambda s: isEnglish(s))
df[df.isEnglish==False].tags.count()# finding the number of tags that consists of non english characters
22448
df['c_tags']= df.tags.apply(removeNonEnglishWords)
Now the c_tags column contains only the English words from the tags column; hence it is safe to drop the isEnglish and tags columns from df.
df.drop(columns=['isEnglish','tags'],axis=1,inplace=True)
df['isEnglish'] = df.title.apply(lambda s: isEnglish(s))
df[df.isEnglish==False].title.count()
40065
df['c_title']= df.title.apply(removeNonEnglishWords)
df.drop(columns=['isEnglish','title'],axis=1,inplace=True)
df['isEnglish'] = df.channelTitle.apply(lambda s: isEnglish(s))
df[df.isEnglish==False].channelTitle.count()
4009
df['channel_title']= df.channelTitle.apply(removeNonEnglishWords)
df.drop(columns=['channelTitle','isEnglish'],axis=1,inplace=True)
# --- Clean the description text ---
# Remove links (dot escaped in "www\." so it no longer matches any
# character), strip carriage returns, and lowercase.
df['des'] = (df['des']
             .str.replace(r'http\S+|www\.\S+', '', regex=True)
             .str.replace(r'\r+', '', regex=True)
             .str.lower())
# Remove punctuation. regex=True must be explicit: modern pandas
# defaults to a literal match, which would leave punctuation untouched.
df['des'] = df['des'].str.replace(r'[^\w\s]+', '', regex=True)

# --- Clean the tags ---
# '|' separates tags; replace it literally (regex=False, since '|' is a
# regex metacharacter) with a space, then lowercase.
df['c_tags'] = df['c_tags'].str.replace('|', ' ', regex=False).str.lower()
# Remove punctuation from tags.
df['c_tags'] = df['c_tags'].str.replace(r'[^\w\s]+', '', regex=True)
# Videos without tags carry the literal "[None]" (reduced to "none" by
# the cleaning above); drop it only as a whole word so words that merely
# contain "none" (e.g. "nonetheless") are not mangled.
df['c_tags'] = df['c_tags'].str.replace(r'\bnone\b', '', regex=True)
df[df.c_tags=='none'].head(1)
| video_id | publishedAt | categoryId | trending_date | view_count | likes | dislikes | comment_count | category_name | country | des | c_tags | c_title | channel_title |
|---|
# --- Clean the video title ---
# Replace '|' literally (regex metacharacter) with a space, lowercase.
df['c_title'] = df['c_title'].str.replace('|', ' ', regex=False).str.lower()
# Remove links (dot escaped in "www\.") and carriage returns.
df['c_title'] = (df['c_title']
                 .str.replace(r'http\S+|www\.\S+', '', regex=True)
                 .str.replace(r'\r+', '', regex=True)
                 .str.lower())
# Remove punctuation; regex=True must be explicit (modern pandas treats
# the pattern literally by default).
df['c_title'] = df['c_title'].str.replace(r'[^\w\s]+', '', regex=True)

# --- Clean the channel title (case is kept, matching the original) ---
df['channel_title'] = df['channel_title'].str.replace('|', ' ', regex=False)
df['channel_title'] = df['channel_title'].str.replace(r'[^\w\s]+', '', regex=True)
df.isna().any().sum()
0
df.sample(2)
| video_id | publishedAt | categoryId | trending_date | view_count | likes | dislikes | comment_count | category_name | country | des | c_tags | c_title | channel_title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 108745 | qZjfDbu6hnE | 2020-10-29T22:00:09Z | 10 | 2020-11-02T00:00:00Z | 4117846 | 213258 | 5208 | 7317 | Music | Canada | becky g ozuna no drama official video musica... | becky g becky g 2020 becky g and ozuna becky g... | becky g ozuna no drama official video | BeckyGVEVO |
| 108276 | UthWoHf8wWQ | 2020-10-27T22:54:10Z | 24 | 2020-10-31T00:00:00Z | 675633 | 18884 | 772 | 2848 | Entertainment | Canada | o blocks king von is the latest artist to appe... | king von o block chicago 63rd civiltv karen ci... | civil tv king von welcome to my neighborhood o... | Karen Civil |
df.dtypes
video_id object publishedAt object categoryId int64 trending_date object view_count int64 likes int64 dislikes int64 comment_count int64 category_name object country object des object c_tags object c_title object channel_title object dtype: object
# Parse the ISO-8601 timestamps; both columns carry a 'Z' (UTC) suffix,
# so drop the timezone to get naive datetime64[ns] for easy comparisons.
df['publishedAt']=pd.to_datetime(df.publishedAt)
df['publishedAt'] = df['publishedAt'].dt.tz_convert(None)
df['trending_date']=pd.to_datetime(df.trending_date)
df['trending_date'] = df['trending_date'].dt.tz_convert(None)
# Only three distinct countries -> categorical dtype saves memory.
df['country']= df['country'].astype('category')
df.dtypes
video_id object publishedAt datetime64[ns] categoryId int64 trending_date datetime64[ns] view_count int64 likes int64 dislikes int64 comment_count int64 category_name object country category des object c_tags object c_title object channel_title object dtype: object
df.isna().any().sum()
0
# Standardise column names to snake_case and give the cleaned text
# columns ('des', 'c_tags', 'c_title') their final names.
df.rename(columns={"publishedAt": "published_at","categoryId" : "category_id"\
,'des':'description','c_tags':'tags','c_title':'video_title'}, inplace = True)
df.head(1)
| video_id | published_at | category_id | trending_date | view_count | likes | dislikes | comment_count | category_name | country | description | tags | video_title | channel_title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3C66w5Z0ixs | 2020-08-11 19:20:14 | 22 | 2020-08-12 | 1514614 | 156908 | 5855 | 35313 | People & Blogs | USA | subscribe to brawadis follow me on social twi... | brawadis prank basketball skits ghost funny vi... | i asked her to be my girlfriend | Brawadis |
df[df.likes>df.view_count].sample(2)
| video_id | published_at | category_id | trending_date | view_count | likes | dislikes | comment_count | category_name | country | description | tags | video_title | channel_title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 73992 | HcSwBJY7Xew | 2021-08-10 15:08:22 | 27 | 2021-08-11 | 0 | 50071 | 7256 | 0 | Education | USA | watch the weeknd and create short videos on th... | YouTube | ||
| 74492 | Hb3rmh-_FMw | 2021-08-10 15:04:25 | 27 | 2021-08-13 | 0 | 22030 | 1604 | 0 | Education | USA | epilepsy warning watch and create short videos... | introducing the shorter side of youtube | YouTube |
df.drop(df[df.likes>df.view_count].index,inplace=True)
Resetting the index after dropping a few rows.
df.reset_index(drop=True, inplace=True)
# Use video_id as the index so it is written as the first CSV column.
df.set_index('video_id',inplace=True)
# The '.zip' suffix makes pandas compress the output automatically.
df.to_csv('Clean_Dataset_final.csv.zip')
df.shape
(277115, 13)
Resetting the index back, as it was set to video_id only for writing the file to CSV.
df.reset_index(inplace = True)
df.head(1)
| video_id | published_at | category_id | trending_date | view_count | likes | dislikes | comment_count | category_name | country | description | tags | video_title | channel_title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3C66w5Z0ixs | 2020-08-11 19:20:14 | 22 | 2020-08-12 | 1514614 | 156908 | 5855 | 35313 | People & Blogs | USA | subscribe to brawadis follow me on social twi... | brawadis prank basketball skits ghost funny vi... | i asked her to be my girlfriend | Brawadis |
df_US = df[df.country == 'USA']
df_US = df_US.drop_duplicates(subset=['video_id'], keep='last')
df_US_category_counts = df_US.groupby(['trending_date', 'category_name'], as_index=False)['view_count'].sum()
df_US_category_counts['trending_date'] = pd.to_datetime(df_US_category_counts['trending_date']).dt.date
df_US_news_count = df_US_category_counts[df_US_category_counts.category_name == 'News & Politics']
labels = {'view_count': 'View Count (Millions)', 'trending_date': 'Trending Date'}
fig = px.line(df_US_news_count, x='trending_date', y='view_count', title='View Count Time Series for category: News & Politics (USA)', labels=labels)
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout(title_x=0.5)
fig.show()
# To increase the figure size
plt.rcParams['figure.figsize'] = [10, 5]
# Adding unwanted words and social media tags to stopword list:
# NOTE(review): this rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` module imported at the top of the file. The
# preprocessing() function further below reads THIS set, so keep the name.
stopwords = set(STOPWORDS)
stopwords.update(['follow', 'twitter', 'social', 'instagram', 'subscribe', 'snapchat', 'youtube', 'videos', 'video'\
,'channel', 'share', 'facebook', 'comment', 'like', 'take', 'go', 'got', 'back',\
'much', 'made', 'keep', 'watch','none', 'check', 'will', 'make'])
def generate_wordcloud(text, stop_words):
    """Render a word cloud for *text*, ignoring the words in *stop_words*."""
    cloud = WordCloud(
        stopwords=stop_words,
        max_font_size=50,
        max_words=150,
        background_color="white",
    ).generate(text)
    plt.figure()
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# Word cloud for News & Politics over the 2020-10-01 .. 2020-12-01 window.
category = 'News & Politics'
dateMask = (df_US.trending_date > pd.Timestamp(2020,10,1)) & (df_US.trending_date < pd.Timestamp(2020,12,1))
tag_text = " ".join(text for text in df_US.tags[(df_US.category_name == category ) & (dateMask)])
title_text = " ".join(text for text in df_US.video_title[(df_US.category_name == category ) & (dateMask) ])
tag_title_text = tag_text + ' ' + title_text
generate_wordcloud(tag_title_text, stopwords)
# Same category over 2021-04-30 .. 2021-06-01, for comparison.
category = 'News & Politics'
dateMask = (df_US.trending_date > pd.Timestamp(2021,4,30)) & (df_US.trending_date < pd.Timestamp(2021,6,1))
tag_text = " ".join(text for text in df_US.tags[(df_US.category_name == category ) & (dateMask)])
title_text = " ".join(text for text in df_US.video_title[(df_US.category_name == category ) & (dateMask) ])
tag_title_text = tag_text + ' ' + title_text
generate_wordcloud(tag_title_text, stopwords)
To find the popular categories by country
sns.countplot(y='category_name',data=df,hue='country',\
order=df.category_name.value_counts().iloc[:10].index,hue_order=['USA','Great Britain','Canada'])
<AxesSubplot:xlabel='count', ylabel='category_name'>
category_list = ['Entertainment', 'Sports']
for category in category_list:
print("Category: "+category)
tag_text = " ".join(text for text in df.tags[(df.category_name == category )& (df.country == 'USA')])
title_text = " ".join(text for text in df.video_title[(df.category_name == category )& (df.country == 'USA')])
tag_title_text = tag_text + ' ' + title_text
generate_wordcloud(tag_title_text, stopwords)
Category: Entertainment
Category: Sports
for category in category_list:
print("Category: "+category)
tag_text = " ".join(text for text in df.tags[(df.category_name == category )& (df.country == 'Great Britain')])
title_text = " ".join(text for text in df.video_title[(df.category_name == category )& (df.country == 'Great Britain')])
tag_title_text = tag_text + ' ' + title_text
generate_wordcloud(tag_title_text, stopwords)
Category: Entertainment
Category: Sports
# Reload the cleaned dataset (compression inferred from the .zip suffix).
df = pd.read_csv('Clean_Dataset_final.csv.zip')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 277115 entries, 0 to 277114 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 video_id 277115 non-null object 1 published_at 277115 non-null object 2 category_id 277115 non-null int64 3 trending_date 277115 non-null object 4 view_count 277115 non-null int64 5 likes 277115 non-null int64 6 dislikes 277115 non-null int64 7 comment_count 277115 non-null int64 8 category_name 277115 non-null object 9 country 277115 non-null object 10 description 272206 non-null object 11 tags 232288 non-null object 12 video_title 276962 non-null object 13 channel_title 276746 non-null object dtypes: int64(5), object(9) memory usage: 29.6+ MB
# Keep only each video's last trending snapshot.
df.drop_duplicates(subset=['video_id'], keep='last', inplace = True)
# Keep just the text columns and the target label for classification
# (`axis` is redundant when `columns=` is given).
df.drop(columns=['channel_title', 'video_id', 'published_at', 'category_id',
                 'trending_date', 'view_count', 'likes', 'dislikes',
                 'comment_count', 'country'], inplace=True)
df.fillna(value = '', inplace = True)
# Combining all text data in a single column:
df['All_text'] = df.description + ' ' + df.tags + ' ' + df.video_title
# Show full cell contents. None is the supported "no limit" value;
# -1 is deprecated and rejected by pandas >= 2.0.
pd.set_option("display.max_colwidth", None)
df.head(1)
| category_name | description | tags | video_title | All_text | |
|---|---|---|---|---|---|
| 172 | Howto & Style | today i show you how to make a curried egg sandwich curried egg sandwiches are my favourite snack they are easy to make and incredibly tasty unfortunately not many people know how to make one correctly that changes today enjoyclick here to eggscribe a video suggestion post it in the comments section contact me through my facebook page or tweet meconnect with mefacebook channel howtobasic shirts and egg plushies someone that likes curried egg sandwiches link this delicious recipe to them | how to make a curried egg sandwich curried egg egg sandwich recipe egg recipe how to make a sandwich curried egg sandwich recipe easy recipe curry recipe how to make curry food step by step recipes healthy recipes egg salad sandwich recipe | how to make a curried egg sandwich | today i show you how to make a curried egg sandwich curried egg sandwiches are my favourite snack they are easy to make and incredibly tasty unfortunately not many people know how to make one correctly that changes today enjoyclick here to eggscribe a video suggestion post it in the comments section contact me through my facebook page or tweet meconnect with mefacebook channel howtobasic shirts and egg plushies someone that likes curried egg sandwiches link this delicious recipe to them how to make a curried egg sandwich curried egg egg sandwich recipe egg recipe how to make a sandwich curried egg sandwich recipe easy recipe curry recipe how to make curry food step by step recipes healthy recipes egg salad sandwich recipe how to make a curried egg sandwich |
df.drop(columns=['description', 'tags', 'video_title'],axis=1,inplace=True)
df.head(1)
| category_name | All_text | |
|---|---|---|
| 172 | Howto & Style | today i show you how to make a curried egg sandwich curried egg sandwiches are my favourite snack they are easy to make and incredibly tasty unfortunately not many people know how to make one correctly that changes today enjoyclick here to eggscribe a video suggestion post it in the comments section contact me through my facebook page or tweet meconnect with mefacebook channel howtobasic shirts and egg plushies someone that likes curried egg sandwiches link this delicious recipe to them how to make a curried egg sandwich curried egg egg sandwich recipe egg recipe how to make a sandwich curried egg sandwich recipe easy recipe curry recipe how to make curry food step by step recipes healthy recipes egg salad sandwich recipe how to make a curried egg sandwich |
def preprocessing(text):
    """Porter-stem every token of *text* that is not in the (global)
    stopword set.

    Returns the stems separated by single spaces; a trailing space is
    emitted after the last stem, exactly as the original loop produced.
    """
    stemmer = PorterStemmer()
    stems = (stemmer.stem(token) for token in text.split()
             if token not in stopwords)
    return "".join(stem + " " for stem in stems)
test = "today i show you how to make a curried egg sandwich curried egg sandwiches are my favourite snack they are easy to make and incredibly tasty unfortunately not many people know how to make one correctly that changes today enjoyclick here to eggscribe a video suggestion post it in the comments section contact me through my facebook page or tweet meconnect with mefacebook channel howtobasic shirts and egg plushies someone that likes curried egg sandwiches link this delicious recipe to them how to make a curried egg sandwich curried egg egg sandwich recipe egg recipe how to make a sandwich curried egg sandwich recipe easy recipe curry recipe how to make curry food step by step recipes healthy recipes egg salad sandwich recipe how to make a curried egg sandwich"
preprocessing(test)
'today show curri egg sandwich curri egg sandwich favourit snack easi incred tasti unfortun mani peopl know one correctli chang today enjoyclick eggscrib suggest post comment section contact page tweet meconnect mefacebook howtobas shirt egg plushi someon like curri egg sandwich link delici recip curri egg sandwich curri egg egg sandwich recip egg recip sandwich curri egg sandwich recip easi recip curri recip curri food step step recip healthi recip egg salad sandwich recip curri egg sandwich '
df['All_text'] = df['All_text'].apply(preprocessing)
# Tools for the text-classification experiment: splitting, vectorizing,
# and the random-forest classifier.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# How many trending videos fall into each category?
df.category_name.value_counts()
Entertainment 6315 Sports 5719 Music 4172 Gaming 3805 People & Blogs 2527 Comedy 1588 News & Politics 1224 Howto & Style 923 Science & Technology 906 Film & Animation 754 Autos & Vehicles 584 Education 559 Travel & Events 170 Pets & Animals 148 Nonprofits & Activism 25 Name: category_name, dtype: int64
- `df_filtered` — the dataframe restricted to the categories 'Entertainment', 'Sports', 'Music', 'Gaming', 'People & Blogs', 'Comedy' and 'News & Politics'
# Keep only the seven most frequent categories for classification.
major_categories = ['Entertainment', 'Sports', 'Music', 'Gaming',
                    'People & Blogs', 'Comedy', 'News & Politics']
df_filtered = df[df.category_name.isin(major_categories)]

# Features: the preprocessed text; target: the category label.
X = df_filtered.All_text
Y = df_filtered.category_name
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Bag-of-words features: fit the vocabulary on the training split only,
# then reuse it to transform the test split.
bag_of_words_creator = CountVectorizer()
X_train_bow = bag_of_words_creator.fit_transform(X_train)
X_test_bow = bag_of_words_creator.transform(X_test)

# Random forest with a fixed seed for reproducibility.
cl = RandomForestClassifier(random_state=0)
cl.fit(X_train_bow, Y_train)
RandomForestClassifier(random_state=0)
# Evaluate the bag-of-words model on the held-out rows.
y_pred = cl.predict(X_test_bow)

from sklearn.metrics import confusion_matrix
import sklearn.metrics as met

# Rows are true categories, columns are predicted categories.
confusion_matrix(Y_test, y_pred)
array([[ 278, 120, 19, 14, 4, 26, 8],
[ 26, 1575, 57, 93, 14, 94, 42],
[ 8, 43, 1075, 12, 0, 19, 12],
[ 9, 38, 5, 1155, 1, 6, 2],
[ 4, 23, 0, 7, 312, 6, 15],
[ 19, 200, 38, 60, 6, 448, 17],
[ 2, 27, 3, 7, 2, 14, 1640]])
# Per-class precision/recall/F1 for the bag-of-words model.
print(met.classification_report(Y_test,y_pred))
precision recall f1-score support
Comedy 0.80 0.59 0.68 469
Entertainment 0.78 0.83 0.80 1901
Gaming 0.90 0.92 0.91 1169
Music 0.86 0.95 0.90 1216
News & Politics 0.92 0.85 0.88 367
People & Blogs 0.73 0.57 0.64 788
Sports 0.94 0.97 0.96 1695
accuracy 0.85 7605
macro avg 0.85 0.81 0.82 7605
weighted avg 0.85 0.85 0.85 7605
# Repeat the experiment with TF-IDF weighting instead of raw counts.
tfidf_creator = TfidfVectorizer()
X_train_tfidf = tfidf_creator.fit_transform(X_train)
X_test_tfidf = tfidf_creator.transform(X_test)

# Same classifier configuration so the comparison is apples-to-apples.
cl = RandomForestClassifier(random_state=0)
cl.fit(X_train_tfidf, Y_train)
RandomForestClassifier(random_state=0)
# Predictions and confusion matrix for the TF-IDF model.
y_pred = cl.predict(X_test_tfidf)
confusion_matrix(Y_test, y_pred)
array([[ 266, 135, 26, 11, 2, 19, 10],
[ 23, 1603, 54, 85, 13, 80, 43],
[ 5, 55, 1074, 7, 0, 14, 14],
[ 5, 39, 3, 1161, 1, 5, 2],
[ 3, 26, 0, 7, 311, 7, 13],
[ 18, 227, 42, 57, 5, 420, 19],
[ 3, 35, 3, 4, 4, 7, 1639]])
# Per-class precision/recall/F1 for the TF-IDF model.
print(met.classification_report(Y_test,y_pred))
precision recall f1-score support
Comedy 0.82 0.57 0.67 469
Entertainment 0.76 0.84 0.80 1901
Gaming 0.89 0.92 0.91 1169
Music 0.87 0.95 0.91 1216
News & Politics 0.93 0.85 0.88 367
People & Blogs 0.76 0.53 0.63 788
Sports 0.94 0.97 0.95 1695
accuracy 0.85 7605
macro avg 0.85 0.80 0.82 7605
weighted avg 0.85 0.85 0.85 7605
From this ML model, we can infer the missing category of any video, based on the information obtained from its title, tags and description.
A platform like YouTube could use this model to recommend a category to users by analysing the title, tags and description in real time while they are uploading their videos.
# Reload the cleaned dataset and collapse it to one row per
# (video, trending day, publish time): the maximum of each engagement metric.
df = pd.read_csv('Clean_Dataset_final.csv.zip')
aggregations = {'view_count': 'max', 'likes': 'max',
                'dislikes': 'max', 'comment_count': 'max'}
ML_df = df.groupby(['video_id', 'trending_date', 'published_at'],
                   as_index=False).agg(aggregations)

# Spot-check one video's trending history.
ML_df[ML_df.video_id == '9Lhbm87KmOc']
| video_id | trending_date | published_at | view_count | likes | dislikes | comment_count | |
|---|---|---|---|---|---|---|---|
| 25636 | 9Lhbm87KmOc | 2021-11-05 00:00:00 | 2021-11-04 15:00:08 | 1499810 | 76792 | 540 | 8114 |
| 25637 | 9Lhbm87KmOc | 2021-11-06 00:00:00 | 2021-11-04 15:00:08 | 2123516 | 96213 | 708 | 9845 |
| 25638 | 9Lhbm87KmOc | 2021-11-07 00:00:00 | 2021-11-04 15:00:08 | 2368472 | 104490 | 797 | 10495 |
| 25639 | 9Lhbm87KmOc | 2021-11-08 00:00:00 | 2021-11-04 15:00:08 | 2564579 | 111414 | 867 | 10945 |
| 25640 | 9Lhbm87KmOc | 2021-11-09 00:00:00 | 2021-11-04 15:00:08 | 2718668 | 116626 | 913 | 11368 |
| 25641 | 9Lhbm87KmOc | 2021-11-10 00:00:00 | 2021-11-04 15:00:08 | 2831812 | 120078 | 946 | 11617 |
# Work on a copy so the aggregated frame stays untouched.
df1 = ML_df.copy()
df1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 157671 entries, 0 to 157670 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 video_id 157671 non-null object 1 trending_date 157671 non-null object 2 published_at 157671 non-null object 3 view_count 157671 non-null int64 4 likes 157671 non-null int64 5 dislikes 157671 non-null int64 6 comment_count 157671 non-null int64 dtypes: int64(4), object(3) memory usage: 9.6+ MB
# Parse the date columns, then derive per-video trending statistics.
df1['trending_date'] = pd.to_datetime(df1['trending_date'])
df1['published_at'] = pd.to_datetime(df1['published_at'])

# 1-based position of each trending day within a video's run.
df1['trending_day_no'] = df1.groupby(['video_id'])["trending_date"].rank(method='first', ascending=True)

# Total number of days each video trended.
df1_count = (df1.groupby('video_id', as_index=False)["trending_date"]
                .count()
                .rename(columns={'trending_date': 'total_trending_days'}))

# Attach the per-video total back onto the daily rows.
df2 = df1.merge(df1_count, on='video_id')
df2.head()
| video_id | trending_date | published_at | view_count | likes | dislikes | comment_count | trending_day_no | total_trending_days | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | --0bCF-iK2E | 2021-07-04 | 2021-07-01 10:00:00 | 304339 | 8753 | 76 | 988 | 1.0000 | 5 |
| 1 | --0bCF-iK2E | 2021-07-05 | 2021-07-01 10:00:00 | 361948 | 10020 | 93 | 1044 | 2.0000 | 5 |
| 2 | --0bCF-iK2E | 2021-07-06 | 2021-07-01 10:00:00 | 398274 | 10750 | 100 | 1067 | 3.0000 | 5 |
| 3 | --0bCF-iK2E | 2021-07-07 | 2021-07-01 10:00:00 | 419194 | 11108 | 106 | 1077 | 4.0000 | 5 |
| 4 | --0bCF-iK2E | 2021-07-08 | 2021-07-01 10:00:00 | 433340 | 11276 | 110 | 1083 | 5.0000 | 5 |
# Pearson correlation between the numeric columns.
df2.corr()
| view_count | likes | dislikes | comment_count | trending_day_no | total_trending_days | |
|---|---|---|---|---|---|---|
| view_count | 1.0000 | 0.8326 | 0.7419 | 0.4673 | 0.2451 | 0.3149 |
| likes | 0.8326 | 1.0000 | 0.6821 | 0.6792 | 0.1722 | 0.2439 |
| dislikes | 0.7419 | 0.6821 | 1.0000 | 0.4742 | 0.1503 | 0.1938 |
| comment_count | 0.4673 | 0.6792 | 0.4742 | 1.0000 | 0.0528 | 0.0804 |
| trending_day_no | 0.2451 | 0.1722 | 0.1503 | 0.0528 | 1.0000 | 0.4728 |
| total_trending_days | 0.3149 | 0.2439 | 0.1938 | 0.0804 | 0.4728 | 1.0000 |
# Same sample video as before, now with the trending-day features attached.
df2[df2['video_id'] == '9Lhbm87KmOc']
| video_id | trending_date | published_at | view_count | likes | dislikes | comment_count | trending_day_no | total_trending_days | |
|---|---|---|---|---|---|---|---|---|---|
| 25636 | 9Lhbm87KmOc | 2021-11-05 | 2021-11-04 15:00:08 | 1499810 | 76792 | 540 | 8114 | 1.0000 | 6 |
| 25637 | 9Lhbm87KmOc | 2021-11-06 | 2021-11-04 15:00:08 | 2123516 | 96213 | 708 | 9845 | 2.0000 | 6 |
| 25638 | 9Lhbm87KmOc | 2021-11-07 | 2021-11-04 15:00:08 | 2368472 | 104490 | 797 | 10495 | 3.0000 | 6 |
| 25639 | 9Lhbm87KmOc | 2021-11-08 | 2021-11-04 15:00:08 | 2564579 | 111414 | 867 | 10945 | 4.0000 | 6 |
| 25640 | 9Lhbm87KmOc | 2021-11-09 | 2021-11-04 15:00:08 | 2718668 | 116626 | 913 | 11368 | 5.0000 | 6 |
| 25641 | 9Lhbm87KmOc | 2021-11-10 | 2021-11-04 15:00:08 | 2831812 | 120078 | 946 | 11617 | 6.0000 | 6 |
# Confirm dtypes after the date parsing and merge.
df2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 157671 entries, 0 to 157670 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 video_id 157671 non-null object 1 trending_date 157671 non-null datetime64[ns] 2 published_at 157671 non-null datetime64[ns] 3 view_count 157671 non-null int64 4 likes 157671 non-null int64 5 dislikes 157671 non-null int64 6 comment_count 157671 non-null int64 7 trending_day_no 157671 non-null float64 8 total_trending_days 157671 non-null int64 dtypes: datetime64[ns](2), float64(1), int64(5), object(1) memory usage: 12.0+ MB
# Decompose the publish timestamp into numeric calendar features for ML.
df2['published_year'] = df2.published_at.dt.year
df2['published_month'] = df2.published_at.dt.month
df2['published_day'] = df2.published_at.dt.day
df2['published_hour'] = df2.published_at.dt.hour
df2['published_minute'] = df2.published_at.dt.minute
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the supported replacement (cast back to int64
# because isocalendar() returns the nullable UInt32 dtype).
df2['published_week'] = df2.published_at.dt.isocalendar().week.astype('int64')
# Identifiers and raw timestamps are no longer needed as features.
df2.drop(columns=['video_id', 'trending_date', 'published_at'], inplace=True)
Now the dataset is ready for machine learning.
# Preview the final feature matrix.
df2.head()
| view_count | likes | dislikes | comment_count | trending_day_no | total_trending_days | published_year | published_month | published_day | published_hour | published_minute | published_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 304339 | 8753 | 76 | 988 | 1.0000 | 5 | 2021 | 7 | 1 | 10 | 0 | 26 |
| 1 | 361948 | 10020 | 93 | 1044 | 2.0000 | 5 | 2021 | 7 | 1 | 10 | 0 | 26 |
| 2 | 398274 | 10750 | 100 | 1067 | 3.0000 | 5 | 2021 | 7 | 1 | 10 | 0 | 26 |
| 3 | 419194 | 11108 | 106 | 1077 | 4.0000 | 5 | 2021 | 7 | 1 | 10 | 0 | 26 |
| 4 | 433340 | 11276 | 110 | 1083 | 5.0000 | 5 | 2021 | 7 | 1 | 10 | 0 | 26 |
# Features / target split: predict how many days a video trends in total.
X = df2.drop('total_trending_days',axis=1)
Y = df2.total_trending_days
Build and Train
# Fit a shallow decision tree as a quick, interpretable baseline
# (trained on ALL rows — no hold-out here; this is for visualization).
import sklearn.tree
dt = sklearn.tree.DecisionTreeRegressor(max_depth = 2)
dt.fit(X,Y)
DecisionTreeRegressor(max_depth=2)
Visualize the tree
# Baseline prediction: the global mean of the target.
Y.mean()
5.942811296941099
# Export the fitted tree to Graphviz format and render it inline.
import sklearn.tree as tree
from IPython.display import Image
import pydotplus

dt_feature_names = list(X.columns)
# np.string_ (a bytes alias) was deprecated and removed in NumPy 2.0;
# export_graphviz expects plain strings for class names anyway.
dt_target_names = Y.unique().astype(str)
tree.export_graphviz(dt, out_file='tree.dot',
                     feature_names=dt_feature_names, class_names=dt_target_names,
                     filled=True)
graph = pydotplus.graph_from_dot_file('tree.dot')
Image(graph.create_png())
# Distribution of the target variable (total trending days).
Y.value_counts()
6 43230 5 36150 7 31073 4 18620 8 12960 3 5457 9 4410 10 1950 2 1650 1 846 11 605 12 144 36 72 24 48 23 46 15 45 20 40 13 39 18 36 17 34 34 34 16 32 29 29 14 28 27 27 26 26 21 21 19 19 Name: total_trending_days, dtype: int64
# Bucket the target into three coarse ranges and plot the class balance.
trending_days_binned = pd.cut(Y, bins=[0, 4, 9, 40])
trending_days_binned_count = (trending_days_binned.value_counts()
                              .rename_axis('Binned_Trending_Days')
                              .reset_index(name='Count'))
sns.catplot(y='Count', x='Binned_Trending_Days',
            data=trending_days_binned_count, kind='bar', aspect=2)
<seaborn.axisgrid.FacetGrid at 0x7f97fbf93610>
# Re-establish the feature matrix and target (unchanged from above).
X = df2.drop('total_trending_days',axis=1)
Y = df2.total_trending_days
Splitting X and Y for training and test
# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,
test_size=0.3,random_state=0)
Train and predict with LinearRegression
# Ordinary least-squares linear regression on the training split.
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X_train,Y_train)
LinearRegression()
# Predict trending-day counts for the held-out rows.
y_pred = regr.predict(X_test)
y_pred
array([7.33726665, 6.41044498, 5.74882481, ..., 5.25869201, 6.51654887,
6.2732937 ])
# Learned weight for each feature (same order as X.columns).
print("Coefficients: \n", regr.coef_)
Coefficients: [ 6.78397362e-08 3.02357149e-07 -8.49373026e-06 -1.47446349e-06 3.89498091e-01 -2.97693789e-01 -1.26530235e-01 -6.13502341e-03 -1.61209659e-03 -2.99638272e-03 1.37144317e-02]
# R² on the test split. Use the `met` alias imported earlier, and avoid
# shadowing the sklearn function name `r2_score` with its computed value.
r2 = met.r2_score(Y_test, y_pred)
print("Coefficient of determination: %.2f" % r2)
Coefficient of determination: 0.26
# Mean absolute error of the linear model.
(y_pred-Y_test).abs().mean()
1.1358580446855684
# Mean squared error of the linear model.
((y_pred-Y_test)**2).mean()
2.6792137098922284
# Candidate regressors to compare via cross-validation.
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

regs = [Lasso(), ElasticNet(), DecisionTreeRegressor(),
        GradientBoostingRegressor(), MLPRegressor()]
The goal is to find the predictor which minimizes the cross-validated MAD.
# Model selection: pick the regressor with the lowest cross-validated MAD.
from sklearn.model_selection import KFold

nfolds = 3
# BUG FIX: 'neg_mean_absolute_error' returns NEGATED errors, so a HIGHER
# score (closer to zero) means a LOWER absolute error. The original code
# kept the minimum score, which selected the WORST regressor (the output
# showed MLPRegressor at -377 chosen as "best"). Track the maximum instead.
minMAD = -float('inf')
bestREG = ''
for reg in regs:
    kf = KFold(n_splits=nfolds, random_state=0, shuffle=True)
    mad = sklearn.model_selection.cross_val_score(reg, X, Y, \
        cv=kf, scoring='neg_mean_absolute_error').mean()
    # lowest MAD == highest neg-MAD score
    print (str(reg)[:25] + ' with mad= ' + str(mad) )
    if mad > minMAD:
        minMAD = mad
        bestREG = reg
print('***********************************************')
print ('Best Regressor is... ' + str(bestREG)[:25] )
print('**********************')
print ('With MAD Score ' + str(minMAD))
Lasso() with mad= -1.1600336751190794 ElasticNet() with mad= -1.1361904501933953 DecisionTreeRegressor() with mad= -0.6818374970666768 GradientBoostingRegressor with mad= -0.9132557796936741 MLPRegressor() with mad= -377.8684605560439 *********************************************** Best Regressor is... MLPRegressor() ********************** With MAD Score -377.8684605560439